---
title: Data extraction
description: We extract the datasets from Fbref and Transfermarkt using the worldfootballR library.
---

Nous collectons les données de Fbref et Transfermarkt en utilisant la bibliothèque WorldFootballR. 

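Nous collectons des données de 2015 à 2023 auprès des principales ligues européennes de première division : Angleterre, Espagne, Italie, Allemagne, France, Portugal, Écosse, Pologne, Grèce, Turquie, Suisse, Pays-Bas, Belgique, Autriche. Dans l'état actuel du notebook, l'extraction ci-dessous est limitée aux cinq grands championnats (Angleterre, Espagne, Italie, Allemagne, France) et aux saisons se terminant de 2015 à 2025.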

In [1]:
if (!require(devtools, quietly = TRUE)) {
    install.packages("devtools")
    library(devtools)
}

if (!require(worldfootballR)) { 
    devtools::install_github("JaseZiv/worldfootballR")
    library(worldfootballR)
}

if (!require(readr)) {
  install.packages("readr")
  library(readr)
}

Le chargement a nécessité le package : worldfootballR

Le chargement a nécessité le package : readr



### Collecting match results

In [3]:
# Adjust `country` and `year` to study different countries and seasons.
# A wider selection is kept here for reference:
# country <- c("ENG", "ESP", "ITA", "GER", "FRA", "POR", "SCO", "POL", "GRE", "SUI", "NED", "BEL", "AUT")
# year <- c(2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023)

# Country codes whose match results are downloaded.
country <- c("ENG", "ESP", "ITA", "GER", "FRA")
# Season end years to download: 2015 through 2025 (kept as doubles).
year <- seq(2015, 2025, by = 1)
# Men's first-tier match results for the selected countries and seasons.
match_result <- load_match_results(
    country = country,
    gender = "M",
    season_end_year = year,
    tier = "1st"
)

→ Data last updated 2025-02-04 17:32:08.29795408248901 UTC



In [4]:
# Columns of the raw match results that are kept, in output order.
columns_to_keep <- c(
    "Competition_Name", "Country", "Season_End_Year", "Date",
    "Home", "HomeGoals", "Away", "AwayGoals"
)
match_result <- match_result[, columns_to_keep]
# Replace the source column names with short snake_case names (same order as above).
names(match_result) <- c(
    "league", "country", "season", "date",
    "home", "home_goals", "away", "away_goals"
)
# Preview the first rows.
head(match_result)

Unnamed: 0_level_0,league,country,season,date,home,home_goals,away,away_goals
Unnamed: 0_level_1,<chr>,<chr>,<int>,<date>,<chr>,<dbl>,<chr>,<dbl>
1,Premier League,ENG,2015,2014-08-16,Manchester Utd,1,Swansea City,2
2,Premier League,ENG,2015,2014-08-16,Stoke City,0,Aston Villa,1
3,Premier League,ENG,2015,2014-08-16,Leicester City,2,Everton,2
4,Premier League,ENG,2015,2014-08-16,QPR,0,Hull City,1
5,Premier League,ENG,2015,2014-08-16,West Ham,0,Tottenham,1
6,Premier League,ENG,2015,2014-08-16,West Brom,2,Sunderland,2


In [None]:
# List the league names present before they are normalized.
unique(match_result[["league"]])
# Shorten the German league's name to plain "Bundesliga".
match_result$league <- gsub(
    pattern = "Fußball-Bundesliga",
    replacement = "Bundesliga",
    x = match_result$league
)

In [7]:
# Print per-column summary statistics of the cleaned match results.
summary(match_result)

    league            country              season          date           
 Length:19969       Length:19969       Min.   :2015   Min.   :2014-08-08  
 Class :character   Class :character   1st Qu.:2017   1st Qu.:2017-03-18  
 Mode  :character   Mode  :character   Median :2020   Median :2019-12-21  
                                       Mean   :2020   Mean   :2019-12-28  
                                       3rd Qu.:2023   3rd Qu.:2022-09-30  
                                       Max.   :2025   Max.   :2025-05-25  
                                                                          
     home             home_goals         away             away_goals   
 Length:19969       Min.   : 0.000   Length:19969       Min.   :0.000  
 Class :character   1st Qu.: 1.000   Class :character   1st Qu.:0.000  
 Mode  :character   Median : 1.000   Mode  :character   Median :1.000  
                    Mean   : 1.542                      Mean   :1.228  
                    3rd Qu.: 2.000      

In [8]:
# Save the cleaned match results as CSV; the path is relative to the working directory.
write_csv(match_result, "../data/extracted_match_results.csv")

### Collecting head coach data

In [2]:
# Country names (as passed to tm_league_team_urls()) whose league teams are looked up.
countries <- c("England", "Spain", "Italy", "Germany", "France")

# Fetch the Transfermarkt team URLs for one country's league.
#
# Arguments:
#   country    Country name, forwarded to tm_league_team_urls() as `country_name`.
#   start_year Season start year to query. Defaults to 2015, the value that was
#              previously hard-coded, so existing calls behave as before.
#
# Returns whatever tm_league_team_urls() returns on success. If that call
# errors, a warning is raised and an empty character vector is returned so
# that one failing country does not abort the whole collection.
#
# NOTE(review): with the default, only the teams listed for the 2015 season
# are returned; presumably clubs that joined the league in later seasons are
# not covered. Confirm this is intended.
get_team_url <- function(country, start_year = 2015) {
    tryCatch(
        tm_league_team_urls(country_name = country, start_year = start_year),
        error = function(e) {
            warning(
                "Failed to fetch URLs for ", country, ": ", conditionMessage(e),
                call. = FALSE
            )
            character(0)
        }
    )
}

# Look up each country in turn, then flatten the per-country results into one vector.
urls_by_country <- lapply(countries, get_team_url)
teams_url <- unlist(urls_by_country)
# Preview the first entries.
head(teams_url)

In [3]:
# Fetch the Transfermarkt staff-page URL(s) for the "Manager" role of one team.
#
# Arguments:
#   team_url  Team URL, forwarded to tm_team_staff_urls() as `team_urls`.
#
# Returns whatever tm_team_staff_urls() returns on success. If that call
# errors, a warning is raised and an empty character vector is returned.
# The fallback used to be an empty data.frame; character(0) flattens the same
# way under unlist() and matches the fallback used by get_team_url().
get_team_staff_url <- function(team_url) {
    tryCatch(
        tm_team_staff_urls(team_urls = team_url, staff_role = "Manager"),
        error = function(e) {
            warning(
                "Failed to fetch staff URLs for ", team_url, ": ", conditionMessage(e),
                call. = FALSE
            )
            character(0)
        }
    )
}

# Resolve the manager staff page of every team, then flatten into one vector.
staff_urls_by_team <- lapply(teams_url, get_team_staff_url)
teams_staff_url <- unlist(staff_urls_by_team)
# Preview the first entries.
head(teams_staff_url)

In [4]:
# Download the "Manager" staff history for every collected staff URL.
# NOTE(review): the recorded run printed several "indice hors limites"
# (subscript out of bounds) errors at this step; presumably some staff pages
# could not be parsed. Confirm that no team is silently missing.
head_coach <- tm_team_staff_history(team_urls = teams_staff_url, staff_role = "Manager")
# Inspect which leagues and teams are present.
unique(head_coach$league)
# The team column is named `team_name`; `$team` only resolved through partial
# matching on a data.frame, so the full name is used.
unique(head_coach$team_name)

Error: indice hors limites

Error: indice hors limites

Error: indice hors limites

Error: indice hors limites

Error: indice hors limites

Error: indice hors limites

Error: indice hors limites



There is some missing information about country and league in the data. We will add this information manually.

In [15]:
# Count missing values per column (vapply keeps the result a named integer vector).
vapply(head_coach, function(x) sum(is.na(x)), integer(1))
# Show the unique teams with a missing league and/or country.
# The column is `team_name`; `$team` relied on partial matching.
unique(head_coach$team_name[is.na(head_coach$league) | is.na(head_coach$country)])

In [16]:
# Fill in league and country by hand for 'Chievo Verona' and 'GFC Ajaccio'.
# Rows are selected on the full column name `team_name`: `$team` relied on
# partial matching, which returns NULL on a tibble and would make these
# assignments silently do nothing. `%in%` never yields NA, so rows with a
# missing team name are simply not selected.
is_chievo <- head_coach$team_name %in% 'Chievo Verona'
is_ajaccio <- head_coach$team_name %in% 'GFC Ajaccio'

head_coach$league[is_chievo] <- 'Serie A'
head_coach$country[is_chievo] <- 'Italy'
# NOTE(review): 'Ligue 2' is not in the first-division list used by the later
# filter, so these rows are dropped there. Confirm this is intended.
head_coach$league[is_ajaccio] <- 'Ligue 2'
head_coach$country[is_ajaccio] <- 'France'

Filter out leagues that are not first-division leagues

In [17]:
# Keep only rows belonging to first-division leagues.
# A broader league list is kept here for reference:
# first_division_teams <- c(
#     'Premier League', 'LaLiga', 'Serie A', 'Bundesliga', 'Ligue 1', 
#     'Liga Portugal', 'Scottish Premiership', 'PKO BP Ekstraklasa', 'Super League 1', 
#     'Super League', 'Eredivisie', 'Jupiler Pro League')

# League names treated as first division in this run.
first_division_teams <- c(
    "Premier League", "LaLiga", "Serie A", "Bundesliga", "Ligue 1"
)
# Print whether every listed league occurs in the data (shown only, not enforced).
all(first_division_teams %in% head_coach$league)
# Drop every row whose league is not in the list.
is_first_division <- head_coach$league %in% first_division_teams
head_coach <- head_coach[is_first_division, ]
# Preview the first five remaining rows.
head(head_coach, n = 5)

Unnamed: 0_level_0,team_name,league,country,staff_role,staff_name,staff_url,staff_dob,staff_nationality,staff_nationality_secondary,appointed,end_date,days_in_post,matches,wins,draws,losses,ppg
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<date>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Chelsea FC,Premier League,England,Manager,Mauricio Pochettino,https://www.transfermarkt.com/mauricio-pochettino/profil/trainer/9044,"Mar 2, 1972",Argentina,Spain,2023-07-01,,296,44,22,9,13,1.7
2,Chelsea FC,Premier League,England,Manager,Graham Potter,https://www.transfermarkt.com/graham-potter/profil/trainer/23954,"May 20, 1975",England,,2022-09-08,2023-04-02,206,31,12,8,11,1.42
3,Chelsea FC,Premier League,England,Manager,Thomas Tuchel,https://www.transfermarkt.com/thomas-tuchel/profil/trainer/7471,"Aug 29, 1973",Germany,,2021-01-26,2022-09-07,589,100,63,19,18,2.08
4,Chelsea FC,Premier League,England,Manager,Frank Lampard,https://www.transfermarkt.com/frank-lampard/profil/trainer/60805,"Jun 20, 1978",England,,2019-07-04,2021-01-25,571,84,44,15,25,1.75
5,Chelsea FC,Premier League,England,Manager,Maurizio Sarri,https://www.transfermarkt.com/maurizio-sarri/profil/trainer/10073,"Jan 10, 1959",Italy,,2018-07-14,2019-06-30,351,63,40,11,12,2.08


In [18]:
# Columns of the staff history that are kept, in output order.
columns_to_keep <- c(
    "team_name", "league", "country", "staff_name", "appointed", "end_date",
    "days_in_post", "matches", "wins", "draws", "losses"
)
head_coach <- head_coach[, columns_to_keep]

# Replace the source column names with the output names (same order as above).
names(head_coach) <- c(
    "Team", "League", "Country", "HeadCoach", "Appointed", "EndDate",
    "Tenure", "Matches", "Wins", "Draws", "Losses"
)

# Print per-column summary statistics.
summary(head_coach)

     Team              League            Country           HeadCoach        
 Length:3532        Length:3532        Length:3532        Length:3532       
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
   Appointed             EndDate               Tenure           Matches       
 Min.   :1886-06-26   Min.   :1893-08-01   Min.   : -242.0   Min.   :   0.00  
 1st Qu.:1961-11-02   1st Qu.:1963-06-30   1st Qu.:  186.0   1st Qu.:  10.00  
 Median :1987-07-01   Median :1988-03-06   Median :  364.0   Median :  29.00  
 Mean   :1982-05-15   Mean   :1983-04-16   Mean   :  608.2   Mean   

In [None]:
# Save the filtered head-coach table as CSV; the path is relative to the working directory.
write_csv(head_coach, "../data/extracted_head_coach.csv")